library(tidyverse)
library(DESeq2)
library(knitr)
library(kableExtra)
library(plyranges)
library(stringr)
library(biobroom)
library(ggrepel)
library(RColorBrewer)
library(gplots)
library(biomaRt)
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(mixOmics)
library(mosaic)
library(clusterProfiler)
library(DOSE)
library(org.Mm.eg.db)
library(pathview)
library(AnnotationDbi)
library(fgsea)
library(rvest)
library(ggseqlogo)
library(gridExtra)
library(rtracklayer)
library(Biostrings)
library(Rsamtools)
library(data.table)
library(grid)
library(enrichplot)
This dataset is generated using WT and KRas-G12D expressingn colonic tumor from mouse. Tumors were induced via 4-OHT enema. Mice used are - Villin-CreER/+; Apc(fl/fl) - Villin-CreER/+; Apc(fl/fl); KRasG12D/+
Alignment and peak calling was done using the bcbio ATAC-Seq pipeline following instructions described here: (https://bcbio-nextgen.readthedocs.io/en/latest/contents/atac.html)
Merged peak data are used here. This is obtained by combining all peaks from all samples, merging peaks within 500bp of each other, and using featureCount to count all reads in the peaks.
dir.create("PDF_Figure", showWarnings = FALSE)
metadata = readr::read_csv("Data/bcbio_output/metadata.csv") %>%
dplyr::rename("sample"=...1) %>%
dplyr::select(-batch, -phenotype) %>%
dplyr::filter(tissue == "colon_tumor") %>%
mutate(condition=relevel(factor(condition), ref="WT")) %>%
as.data.frame()
rownames(metadata) = metadata$sample
counts = readr::read_tsv("Data/bcbio_output/merged/merged-counts.tsv") %>%
tibble::column_to_rownames("id")
Metadata
metadata %>%
kable() %>%
kable_styling()
| sample | condition | replicate | tissue | |
|---|---|---|---|---|
| WT_tumor_rep1 | WT_tumor_rep1 | WT | rep1 | colon_tumor |
| WT_tumor_rep2 | WT_tumor_rep2 | WT | rep2 | colon_tumor |
| WT_tumor_rep3 | WT_tumor_rep3 | WT | rep3 | colon_tumor |
| Kras_tumor_rep1 | Kras_tumor_rep1 | Kras | rep1 | colon_tumor |
| Kras_tumor_rep2 | Kras_tumor_rep2 | Kras | rep2 | colon_tumor |
| Kras_tumor_rep3 | Kras_tumor_rep3 | Kras | rep3 | colon_tumor |
Load Ensembl ID dataset
ensembl = useMart("ensembl", dataset="mmusculus_gene_ensembl")
symbols = getBM(attributes=c("entrezgene_id", "mgi_symbol", "ensembl_gene_id"), mart=ensembl) %>%
dplyr::rename("geneId"=entrezgene_id) %>%
mutate(geneId=as.character(geneId))
save(symbols, file = "Data/symbols.rda")
We need to do a little it of cleanup before contining. We need to remove peaks that appear in regions known to be false positive machines, called the blacklist regions. We also are interested in regions that could be affecting expression via chromatin accessability, so we’ll only consider peaks within a window around the transcription start sites of genes.
load("Data/symbols.rda")
expand_region_string = function(df, column) {
df = as.data.frame(df)
tokens = str_split_fixed(df[,column], ":", 2)
chrom = tokens[,1]
start = as.numeric(str_split_fixed(tokens[,2], "-", 2)[, 1])
end = as.numeric(str_split_fixed(tokens[,2], "-", 2)[, 2])
regions = data.frame(seqnames=chrom, start=start, end=end)
df %>%
bind_cols(regions)
}
blacklist = readr::read_tsv("Data/mm10-blacklist.v2.bed.gz", col_names=c("seqnames", "start", "end")) %>%
as_granges()
blacklist_peaks = counts %>%
tibble::rownames_to_column("peak") %>%
expand_region_string("peak") %>%
as_granges() %>%
join_overlap_inner(blacklist) %>%
tidy() %>%
pull(peak)
counts = counts[!rownames(counts) %in% blacklist_peaks, ]
A few peaks overlap the ENCODE blacklist regions, so we removed them from the analysis. There were 8440 that overlapped with the blacklist regions.
Here we annotate the peaks that we called with contextual genomic information. We can also see that some peaks are annotated in multiple regions, this can’t be helped as genes overlap so there are areas where multiple features of genes overlap.
annotatedobj = ChIPseeker::annotatePeak(counts %>%
tibble::rownames_to_column("peak") %>%
expand_region_string("peak") %>%
as_granges(), TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene)
## >> preparing features information... 2021-10-07 16:46:35
## >> identifying nearest features... 2021-10-07 16:46:37
## >> calculating distance from peak to TSS... 2021-10-07 16:46:49
## >> assigning genomic annotation... 2021-10-07 16:46:49
## >> assigning chromosome lengths 2021-10-07 16:47:28
## >> done... 2021-10-07 16:47:28
plotAnnoPie(annotatedobj)
pdf('PDF_Figure/Annotation_Pie.pdf',
width = 6,
height = 4)
plotAnnoPie(annotatedobj)
dev.off()
## quartz_off_screen
## 2
upsetplot(annotatedobj)
## Warning: Removed 1660 rows containing non-finite values (stat_count).
pdf('PDF_Figure/UpsetPlot.pdf',
width = 6,
height = 4)
upsetplot(annotatedobj)
## Warning: Removed 1660 rows containing non-finite values (stat_count).
dev.off()
## quartz_off_screen
## 2
Generally we only care about peaks that are close to gene, here we are pretty lenient and consider peaks that are within 1000 bases of a transcription start site of a gene.
annotated = annotatedobj %>%
as.GRanges() %>%
tidy()
ggplot(annotated, aes(distanceToTSS)) +
stat_density(geom="line") +
xlim(c(-10000,10000))
## Warning: Removed 317541 rows containing non-finite values (stat_density).
KEEP_DISTANCE = 1000
close_to_tss = annotated %>%
dplyr::filter(abs(distanceToTSS) < KEEP_DISTANCE) %>%
pull(peak)
counts = counts[rownames(counts) %in% close_to_tss,]
Most peaks fall close to the TSS, but there are some that are very far away from the TSS. We’ll remove peaks that aren’t anywhere near the TSS for a gene from the analysis. We’ll keep only peaks within 1000 bases of a TSS. This leaves us with 36844 peaks to consider.
Here we annotate the peaks post-filtering
annotatedobj = ChIPseeker::annotatePeak(counts %>%
tibble::rownames_to_column("peak") %>%
expand_region_string("peak") %>%
as_granges(), TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene)
## >> preparing features information... 2021-10-07 16:53:45
## >> identifying nearest features... 2021-10-07 16:53:45
## >> calculating distance from peak to TSS... 2021-10-07 16:53:47
## >> assigning genomic annotation... 2021-10-07 16:53:47
## >> assigning chromosome lengths 2021-10-07 16:53:53
## >> done... 2021-10-07 16:53:53
plotAnnoPie(annotatedobj)
pdf('PDF_Figure/Annotation_Pie_filtered.pdf',
width = 6,
height = 4)
plotAnnoPie(annotatedobj)
dev.off()
## quartz_off_screen
## 2
upsetplot(annotatedobj)
pdf('PDF_Figure/UpsetPlot_filtered.pdf',
width = 6,
height = 4)
upsetplot(annotatedobj)
dev.off()
## quartz_off_screen
## 2
Here we look at the condition_Kras_vs_WT coefficient, which will show us the peak affinity differences between the mouse KRasG12D and WT pancreatic epithelial cell samples.
dds = DESeqDataSetFromMatrix(counts, metadata, design=~condition)
## converting counts to integer mode
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Print scale factors for BW normalization
1/sizeFactors(dds)
## WT_tumor_rep1 WT_tumor_rep2 WT_tumor_rep3 Kras_tumor_rep1 Kras_tumor_rep2
## 0.9023553 1.0468537 1.0241625 1.2142010 0.9657078
## Kras_tumor_rep3
## 0.8672074
plotDispEsts(dds)
CUTOFF = 0.05
dds_result <- lfcShrink(dds, coef = "condition_Kras_vs_WT", type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
DESeq2::plotMA(dds_result)
dds_transform <- varianceStabilizingTransformation(dds)
rawCountTable_transform <- as.data.frame(assay(dds_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
png('Figure/Hierchical_Clustering_merged.png')
cim(mat.dist, symkey = FALSE, margins = c(7,7))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Figure/Hierchical_Clustering_merged.png')
pdf('PDF_Figure/Hierchical_Clustering.pdf',
width = 6,
height = 6)
cim(mat.dist, symkey = FALSE, margins = c(7,7))
dev.off()
## quartz_off_screen
## 2
The top 500 most variable genes are selected for PCA analysis.
plotPCA(dds_transform, intgroup = "condition", ntop = 500)
pdf('PDF_Figure/PCA.pdf',
width = 6,
height = 4)
plotPCA(dds_transform, intgroup = "condition", ntop = 500)
dev.off()
## quartz_off_screen
## 2
Signal to noise ratio is calculated using the definition given by Broad GSEA.
markup_deseq2 = function(res) {
resanno = res %>%
expand_region_string("peak") %>%
as_granges()
ChIPseeker::annotatePeak(resanno, TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene) %>%
as.GRanges() %>%
tidy() %>%
left_join(symbols, by="geneId")
}
res = dds_result %>%
as_tibble(rownames = "peak")
# calculate signal-to-noise ratio for GSEA later
s2n <- function(num_list, cond_1 = c(4:6), cond_2 = c(1:3)) {
mean1 <- mean(num_list[cond_1])
if (mean1 == 0) {
mean1 = 1
}
mean2 <- mean(num_list[cond_2])
if (mean2 == 0) {
mean2 = 1
}
sd1 <- sd(num_list[cond_1])
sd2 <- sd(num_list[cond_2])
sd1 <- min(sd1, 0.2*abs(mean1))
sd2 <- min(sd2, 0.2*abs(mean2))
s2nvalue <- (mean1-mean2)/(sd1+sd2)
return(s2nvalue)
}
rawCountTable <- as.data.frame(DESeq2::counts(dds, normalize = TRUE))
res$s2n <- apply(rawCountTable,1,s2n)
res <- markup_deseq2(res)
## >> preparing features information... 2021-10-07 16:55:11
## >> identifying nearest features... 2021-10-07 16:55:11
## >> calculating distance from peak to TSS... 2021-10-07 16:55:13
## >> assigning genomic annotation... 2021-10-07 16:55:13
## >> assigning chromosome lengths 2021-10-07 16:55:20
## >> done... 2021-10-07 16:55:20
res <- res[!duplicated(res$peak),]
write.csv(res, "Result/ATAC_crc_Kras-WT_merged.csv", row.names = FALSE)
rawCountTable <- rawCountTable %>% rownames_to_column(var = "peak")
rawCountTable <- left_join(rawCountTable, res[,c(6,24)], by = c("peak" ="peak"))
write.csv(rawCountTable, "Result/ATAC_crc_normalized_count.csv", row.names = FALSE, na = "")
res_sig = res %>%
dplyr::filter(padj < CUTOFF) %>%
#arrange(pvalue) %>%
#head(200) %>%
pull(peak)
ncounts = DESeq2::counts(dds, normalized=TRUE)
pseudoCount = log2(ncounts + 1)
res_sig_counts = ncounts[rownames(ncounts) %in% res_sig,] %>% apply(1,zscore) %>% t()
my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
png('Figure/KRas vs WT crc ATAC-Seq_merged.png',
width = 600,
height = 1200,
res = 150,
pointsize = 8)
par(cex.main=1.1)
heatmap.2(res_sig_counts,
main = "Peaks with differential affinity\n in KRasG12D colonic tumor\npadj < 0.05",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
col=my_palette,
margins = c(15,2),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
labRow = FALSE,
ylab = "Genes",
cexCol = 2,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('Figure/KRas vs WT crc ATAC-Seq_merged.png')
pdf('PDF_Figure/Heatmap.pdf',
width = 6,
height = 12)
par(cex.main=1.1)
heatmap.2(res_sig_counts,
main = "Peaks with differential affinity\n in KRasG12D colonic tumor\npadj < 0.05",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
col=my_palette,
margins = c(15,2),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
labRow = FALSE,
ylab = "Genes",
cexCol = 2,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
# Scatter plot
res$kras_mean <- rowMeans(pseudoCount[,4:6])
res$wt_mean <- rowMeans(pseudoCount[,1:3])
ggplot(res, aes(x = wt_mean, y = kras_mean)) +
xlab("WT_Average(log2)") + ylab("KRas_Average(log2)") +
geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRasG12D vs WT ATAC-Seq Scatter Plot")
pdf('PDF_Figure/Scatter_Plot.pdf',
width = 5,
height = 4)
ggplot(res, aes(x = wt_mean, y = kras_mean)) +
xlab("WT_Average(log2)") + ylab("KRas_Average(log2)") +
geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRasG12D vs WT ATAC-Seq Scatter Plot")
dev.off()
## quartz_off_screen
## 2
# MA plot
ggplot(res, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq MA Plot")
pdf('PDF_Figure/MA_Plot.pdf',
width = 5,
height = 4)
ggplot(res, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq MA Plot")
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
ggplot(res, aes(x = log2FoldChange, y = -log(padj,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = res, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq Volcano Plot")
## Warning: Removed 17858 rows containing missing values (geom_point).
pdf('PDF_Figure/Volcano_Plot.pdf',
width = 5,
height = 4)
ggplot(res, aes(x = log2FoldChange, y = -log(padj,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = res, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq Volcano Plot")
## Warning: Removed 17858 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Classic GO analysis is performed here for all DE peak-genes detected in this dataset. The reference list is list of peak-genes detected in ATAC-Seq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.
dir.create("Result/GO", showWarnings = FALSE)
target_gene <- res %>% dplyr::filter(padj < 0.05) %>% dplyr::select(ensembl_gene_id) %>% unlist() %>% as.character()
detected_gene <- res %>% dplyr::select(ensembl_gene_id) %>% unlist() %>% as.character()
# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
write.csv(cluster_summary_BP, "Result/GO/GO analysis_BP_merged.csv")
# Run GO enrichment analysis for molecular function
ego_MF <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
write.csv(cluster_summary_MF, "Result/GO/GO analysis_MF_merged.csv")
# Run GO enrichment analysis for cellular component
ego_CC <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
write.csv(cluster_summary_CC, "Result/GO/GO analysis_CC_merged.csv")
pdf("PDF_Figure/GO_BP_Dotplot.pdf",
width = 10,
height = 10)
dotplot(ego_BP, showCategory=20)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_BP_Dotplot.pdf')
ego_BP <- pairwise_termsim(ego_BP)
pdf("PDF_Figure/GO_BP_eMAPplot.pdf",
width = 10,
height = 10)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_BP_eMAPplot.pdf')
OE_foldchanges <- res %>% dplyr::filter(padj < 0.05) %>% dplyr::select(c(log2FoldChange, mgi_symbol))
OE <- OE_foldchanges$log2FoldChange %>% unlist()
names(OE) <- OE_foldchanges$mgi_symbol
pdf("PDF_Figure/GO_BP_cNetplot.pdf",
width = 10,
height = 10)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_BP_cNetplot.pdf')
pdf("PDF_Figure/GO_MF_Dotplot.pdf",
width = 10,
height = 5)
dotplot(ego_MF, showCategory=10)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_MF_Dotplot.pdf')
ego_MF <- pairwise_termsim(ego_MF)
pdf("PDF_Figure/GO_MF_eMAPplot.pdf",
width = 10,
height = 10)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_MF_eMAPplot.pdf')
pdf("PDF_Figure/GO_MF_cNetplot.pdf",
width = 10,
height = 10)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_MF_cNetplot.pdf')
pdf("PDF_Figure/GO_CC_Dotplot.pdf",
width = 10,
height = 5)
dotplot(ego_CC, showCategory=10)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_CC_Dotplot.pdf')
ego_CC <- pairwise_termsim(ego_CC)
pdf("PDF_Figure/GO_CC_eMAPplot.pdf",
width = 10,
height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_CC_eMAPplot.pdf')
pdf("PDF_Figure/GO_CC_cNetplot.pdf",
width = 10,
height = 10)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
include_graphics('PDF_Figure/GO_CC_cNetplot.pdf')
target_gene <- res %>% dplyr::filter(padj < 0.05) %>% dplyr::select(geneId) %>% unlist() %>% as.character()
detected_gene <- res %>% dplyr::select(geneId) %>% unlist() %>% as.character()
kk <- enrichKEGG(gene = target_gene,
universe = detected_gene,
organism = 'mmu',
keyType = "kegg",
pAdjustMethod = "BH",
pvalueCutoff = 0.05)
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
kk.result <- as.data.frame(kk)
pdf("PDF_Figure/KEGG_Dotplot.pdf",
width = 10,
height = 5)
dotplot(kk, showCategory=10)
dev.off()
## quartz_off_screen
## 2
dir.create("Result/KEGG", showWarnings = FALSE)
write.csv(kk.result, "Result/KEGG/KEGG analysis_merged.csv")
The BED files are prepared in R. HOMER was run in the terminal. ## Prepare function for HOMER result parsing
plotHomerResults <- function(homer.table, homer.pwms, n.motifs = 20) {
ncol <- 4
laymat <- matrix(1:((1 + n.motifs) * ncol), ncol = ncol, byrow = FALSE)
logos.list <- lapply(homer.pwms[1:n.motifs],
function(pwm) {
ggseqlogo(pwm) +
theme(axis.text.x = element_blank(),
axis.title.y = element_blank(),
axis.line.y =
element_line(color = "gray"),
axis.ticks.y =
element_line(color = "gray")) +
ylim(0, 2)
})
ranks.text <- sapply(homer.table$Rank[1:n.motifs], textGrob,
simplify = FALSE)
pval.text <- sapply(sprintf("%.0f", -homer.table$log10.p)[1:n.motifs],
textGrob, simplify = FALSE)
targ.text <- sapply(homer.table$freq.targets[1:n.motifs], textGrob,
simplify = FALSE)
bg.text <- sapply(homer.table$freq.bg[1:n.motifs], textGrob,
simplify = FALSE)
tf.text <- sapply(homer.table$best.match.simple[1:n.motifs], textGrob,
simplify = FALSE)
headers <- sapply(c("rank", "motif", "-log10\np-value",
"freq.\ntargets", "freq.\nbackgr.",
"best match"),
textGrob,
simplify = FALSE)
all.plots <- c(headers[1], ranks.text,
headers[2], logos.list,
headers[3], pval.text,
# headers[4], targ.text,
# headers[5], bg.text,
headers[6], tf.text)
grid.arrange(grobs = all.plots, layout_matrix = laymat,
# widths = c(1, 4, 1, 1, 1, 3),
widths = c(1, 4, 1, 3),
ncol = ncol)
}
loadPWM <- function(filename) {
motif <- fread(filename, skip = 1)
if (nrow(motif) > 0) {
pwm <- t(as.matrix(as.data.frame(motif)))
rownames(pwm) <- c("A", "C", "G", "T")
pwm
} else {
NULL
}
}
loadHomerResults <- function(dirname) {
# also allow version="extended" for motifs without stringent
# similarity filtering
homer.table <-
html_nodes(read_html(sprintf("%s/homerResults.html", dirname)), "table")
homer.table <- html_table(homer.table, header = TRUE)[[1]]
homer.table <- data.table(homer.table, check.names = TRUE)
homer.table <- homer.table[, .(Rank,
log10.p = log.P.pvalue / log(10),
freq.targets = X..of.Targets,
freq.bg = X..of.Background,
best.match = Best.Match.Details)]
homer.table[, filename := sprintf("%s/homerResults/motif%s.motif",
dirname, seq_along(Rank))]
homer.table[, best.match.simple := sapply(strsplit(homer.table$best.match, "/"), "[", 1)]
homer.pwms <- sapply(homer.table$filename, loadPWM, simplify = FALSE)
list(homer.table, homer.pwms)
}
Actual HOMER run was done on O2
res.up <- res %>% dplyr::filter(padj < CUTOFF & log2FoldChange > 0) %>% GRanges()
dir.create("Result/Kmer", showWarnings = FALSE)
rtracklayer::export(res.up, "Result/Kmer/KRas_up_ATAC_peaks_merged.bed")
genomic.regions <- "Result/Kmer/KRas_up_ATAC_peaks_merged.bed"
homerdir <- "Result/Kmer/homer-denovo-output-KRas_up_peaks"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output", genomic.regions, homerdir)
print(homer.cmd)
## [1] "findMotifsGenome.pl Result/Kmer/KRas_up_ATAC_peaks_merged.bed mm10 Result/Kmer/homer-denovo-output-KRas_up_peaks -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output"
up.peaks.homer.res <- loadHomerResults(homerdir)
print(up.peaks.homer.res[[1]]$best.match.simple)
## [1] "HOXC13" "SD0001.1_at_AC_acceptor"
## [3] "MYB" "RUNX-AML(Runt)"
## [5] "POL009.1_DCE_S_II" "ZBTB32"
## [7] "PRDM4"
plotHomerResults(up.peaks.homer.res[[1]],
up.peaks.homer.res[[2]], n.motifs = 6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Actual HOMER run was done on O2
res.down <- res %>% dplyr::filter(padj < CUTOFF & log2FoldChange < 0) %>% GRanges()
dir.create("Result/Kmer", showWarnings = FALSE)
rtracklayer::export(res.up, "Result/Kmer/KRas_down_ATAC_peaks_merged.bed")
genomic.regions <- "Result/Kmer/KRas_down_ATAC_peaks_merged.bed"
homerdir <- "Result/Kmer/homer-denovo-output-KRas_down_peaks"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output", genomic.regions, homerdir)
print(homer.cmd)
## [1] "findMotifsGenome.pl Result/Kmer/KRas_down_ATAC_peaks_merged.bed mm10 Result/Kmer/homer-denovo-output-KRas_down_peaks -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output"
down.peaks.homer.res <- loadHomerResults(homerdir)
print(down.peaks.homer.res[[1]]$best.match.simple)
## [1] "HOXC13" "SD0001.1_at_AC_acceptor"
## [3] "SOX15" "POL009.1_DCE_S_II"
## [5] "ZBTB32" "PB0208.1_Zscan4_2"
plotHomerResults(down.peaks.homer.res[[1]],
down.peaks.homer.res[[2]], n.motifs = 6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] enrichplot_1.12.2
## [2] data.table_1.14.2
## [3] Rsamtools_2.8.0
## [4] Biostrings_2.60.2
## [5] XVector_0.32.0
## [6] rtracklayer_1.52.1
## [7] gridExtra_2.3
## [8] ggseqlogo_0.1
## [9] rvest_1.0.1
## [10] fgsea_1.18.0
## [11] pathview_1.32.0
## [12] org.Mm.eg.db_3.13.0
## [13] DOSE_3.18.3
## [14] clusterProfiler_4.0.5
## [15] mosaic_1.8.3
## [16] ggridges_0.5.3
## [17] mosaicData_0.20.2
## [18] ggformula_0.10.1
## [19] ggstance_0.3.5
## [20] Matrix_1.3-4
## [21] mixOmics_6.16.3
## [22] lattice_0.20-45
## [23] MASS_7.3-54
## [24] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
## [25] GenomicFeatures_1.44.2
## [26] AnnotationDbi_1.54.1
## [27] ChIPseeker_1.28.3
## [28] biomaRt_2.48.3
## [29] gplots_3.1.1
## [30] RColorBrewer_1.1-2
## [31] ggrepel_0.9.1
## [32] biobroom_1.24.0
## [33] broom_0.7.9
## [34] plyranges_1.12.1
## [35] kableExtra_1.3.4
## [36] knitr_1.36
## [37] DESeq2_1.32.0
## [38] SummarizedExperiment_1.22.0
## [39] Biobase_2.52.0
## [40] MatrixGenerics_1.4.3
## [41] matrixStats_0.61.0
## [42] GenomicRanges_1.44.0
## [43] GenomeInfoDb_1.28.4
## [44] IRanges_2.26.0
## [45] S4Vectors_0.30.2
## [46] BiocGenerics_0.38.0
## [47] forcats_0.5.1
## [48] stringr_1.4.0
## [49] dplyr_1.0.7
## [50] purrr_0.3.4
## [51] readr_2.0.2
## [52] tidyr_1.1.4
## [53] tibble_3.1.5
## [54] ggplot2_3.3.5
## [55] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2
## [2] tidyselect_1.1.1
## [3] htmlwidgets_1.5.4
## [4] RSQLite_2.2.8
## [5] BiocParallel_1.26.2
## [6] scatterpie_0.1.7
## [7] munsell_0.5.0
## [8] withr_2.4.2
## [9] colorspace_2.0-2
## [10] GOSemSim_2.18.1
## [11] filelock_1.0.2
## [12] highr_0.9
## [13] rstudioapi_0.13
## [14] labeling_0.4.2
## [15] KEGGgraph_1.52.0
## [16] GenomeInfoDbData_1.2.6
## [17] polyclip_1.10-0
## [18] bit64_4.0.5
## [19] farver_2.1.0
## [20] downloader_0.4
## [21] vctrs_0.3.8
## [22] treeio_1.16.2
## [23] generics_0.1.0
## [24] xfun_0.26
## [25] BiocFileCache_2.0.0
## [26] R6_2.5.1
## [27] graphlayouts_0.7.1
## [28] locfit_1.5-9.4
## [29] bitops_1.0-7
## [30] cachem_1.0.6
## [31] gridGraphics_0.5-1
## [32] DelayedArray_0.18.0
## [33] assertthat_0.2.1
## [34] vroom_1.5.5
## [35] BiocIO_1.2.0
## [36] scales_1.1.1
## [37] ggraph_2.0.5
## [38] gtable_0.3.0
## [39] tidygraph_1.2.0
## [40] rlang_0.4.11
## [41] genefilter_1.74.0
## [42] systemfonts_1.0.2
## [43] splines_4.1.1
## [44] lazyeval_0.2.2
## [45] selectr_0.4-2
## [46] mosaicCore_0.9.0
## [47] yaml_2.2.1
## [48] reshape2_1.4.4
## [49] modelr_0.1.8
## [50] crosstalk_1.1.1
## [51] backports_1.2.1
## [52] qvalue_2.24.0
## [53] tools_4.1.1
## [54] ggplotify_0.1.0
## [55] ellipsis_0.3.2
## [56] jquerylib_0.1.4
## [57] ggdendro_0.1.22
## [58] Rcpp_1.0.7
## [59] plyr_1.8.6
## [60] progress_1.2.2
## [61] zlibbioc_1.38.0
## [62] RCurl_1.98-1.5
## [63] prettyunits_1.1.1
## [64] viridis_0.6.1
## [65] cowplot_1.1.1
## [66] haven_2.4.3
## [67] fs_1.5.0
## [68] magrittr_2.0.1
## [69] RSpectra_0.16-0
## [70] DO.db_2.9
## [71] reprex_2.0.1
## [72] ggnewscale_0.4.5
## [73] hms_1.1.1
## [74] patchwork_1.1.1
## [75] evaluate_0.14
## [76] xtable_1.8-4
## [77] leaflet_2.0.4.1
## [78] XML_3.99-0.8
## [79] readxl_1.3.1
## [80] ggupset_0.3.0
## [81] compiler_4.1.1
## [82] ellipse_0.4.2
## [83] shadowtext_0.0.9
## [84] KernSmooth_2.23-20
## [85] crayon_1.4.1
## [86] htmltools_0.5.2
## [87] corpcor_1.6.10
## [88] ggfun_0.0.4
## [89] tzdb_0.1.2
## [90] geneplotter_1.70.0
## [91] aplot_0.1.1
## [92] lubridate_1.7.10
## [93] DBI_1.1.1
## [94] tweenr_1.0.2
## [95] dbplyr_2.1.1
## [96] rappdirs_0.3.3
## [97] boot_1.3-28
## [98] cli_3.0.1
## [99] igraph_1.2.6
## [100] pkgconfig_2.0.3
## [101] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [102] GenomicAlignments_1.28.0
## [103] xml2_1.3.2
## [104] rARPACK_0.11-0
## [105] ggtree_3.0.4
## [106] svglite_2.0.0
## [107] annotate_1.70.0
## [108] webshot_0.5.2
## [109] yulab.utils_0.0.2
## [110] digest_0.6.28
## [111] graph_1.70.0
## [112] rmarkdown_2.11
## [113] cellranger_1.1.0
## [114] fastmatch_1.1-3
## [115] tidytree_0.3.5
## [116] restfulr_0.0.13
## [117] curl_4.3.2
## [118] gtools_3.9.2
## [119] rjson_0.2.20
## [120] lifecycle_1.0.1
## [121] nlme_3.1-153
## [122] jsonlite_1.7.2
## [123] viridisLite_0.4.0
## [124] fansi_0.5.0
## [125] labelled_2.8.0
## [126] pillar_1.6.3
## [127] plotrix_3.8-2
## [128] KEGGREST_1.32.0
## [129] fastmap_1.1.0
## [130] httr_1.4.2
## [131] survival_3.2-13
## [132] GO.db_3.13.0
## [133] glue_1.4.2
## [134] png_0.1-7
## [135] Rgraphviz_2.36.0
## [136] bit_4.0.4
## [137] ggforce_0.3.3
## [138] stringi_1.7.5
## [139] blob_1.2.2
## [140] org.Hs.eg.db_3.13.0
## [141] caTools_1.18.2
## [142] memoise_2.0.0
## [143] ape_5.5